/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"

.text
.align 5

// void WinogradTransLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k,
//                        size_t length);
//
// For every y in [0, h) and x in [0, w) this routine computes
//     M[y][x][:] = sum over i in [0, k) of B[i * h + y] * S[i][x][:]
// where each "[:]" is one block of `length` float32x4 vectors (length * 16 bytes).
// The M block is zeroed first, so it is fully overwritten even when k == 0.
// Returns immediately, without touching M, when w, h or length is 0.
//
// Arguments (32-bit ARM: the first four arrive in registers, the rest on the caller's stack):
//   r0:        S
//   r1:        B
//   r2:        M
//   r3:        w
//   [sp, #0]:  h       (loaded into r4)
//   [sp, #4]:  k       (loaded into r5)
//   [sp, #8]:  length  (loaded into r6)
//
// Register roles after the prologue:
//   r0  current S read pointer            r7  w * length * 16, byte stride between S slices
//   r1  current B read pointer            r8  length * 16, size of one block in bytes
//   r2  current M block pointer           r9  (w - 1) * length * 16 = r7 - r8
//   r3  remaining w iterations            r10 h * 4, byte stride between consecutive k in B
//   r4  remaining h iterations            r11 inner (length) loop counter
//   r5  k                                 r12 remaining k for the current block
//   r6  length
// NEON registers used: q0, q1, q8, q9, q12-q15 (q15 doubles as spill space for core registers).
asm_function WinogradTransLeft
    push {r4-r11, lr}
    // 9 registers (36 bytes) were pushed, so the stacked arguments start at sp + 36.
    ldr r4, [sp, #36]
    ldr r5, [sp, #40]
    ldr r6, [sp, #44]

    // w, h and length all drive count-down loops that test for zero only after the
    // first decrement; a zero value would wrap around, so bail out early instead.
    cmp r3, #0
    beq TransLeftEnd
    cmp r4, #0
    beq TransLeftEnd
    cmp r6, #0
    beq TransLeftEnd

    mov r8, #16 // 4 * sizeof(float)
    mul r8, r6, r8 // r8 = bytes in one block
    mul r9, r3, r8
    sub r9, r9, r8 // r9 = (w - 1) blocks in bytes
    add r7, r9, r8 // step for S
    mov r10, #4
    mul r10, r4, r10 // step for B

LoopH:
    push {r0, r3} // keep the S base and w for the next h iteration
    LoopW:
        push {r0, r1} // keep this block's S and B start pointers
        vmov.i32 q14, #0
        mov r11, r6
        // Clear the destination block before accumulating into it.
        InitZero:
            vst1.32 {q14}, [r2]!
            subs r11, r11, #1
            bne InitZero

        sub r2, r2, r8 // rewind M to the start of the block
        mov r12, r5 // r12 = k terms still to accumulate

        // ---- 7 terms of k per pass ----
        LoopKStart7:
            cmp r12, #7
            blt LoopKStart4
            // r3-r7 are about to be reused as S pointers.
            // Stack slots: [sp]=r3, [sp,#4]=r4, [sp,#8]=r5, [sp,#12]=r6, [sp,#16]=r7.
            push {r3-r7}
        LoopK7:
            // Gather 7 consecutive-k coefficients of B into d0-d3.
            vld1.32 {d0[0]}, [r1], r10
            vld1.32 {d0[1]}, [r1], r10
            vld1.32 {d1[0]}, [r1], r10
            vld1.32 {d1[1]}, [r1], r10
            vld1.32 {d2[0]}, [r1], r10
            vld1.32 {d2[1]}, [r1], r10
            vld1.32 {d3[0]}, [r1], r10
            mov r11, r6
            vmov.32 d30[0], r1 // park the B pointer; r1 becomes an S pointer below

            // r0, r1, r3, r4, r5, r6, r7 -> the same block in 7 consecutive S slices.
            add r1, r0, r7
            add r3, r1, r7
            add r4, r3, r7
            add r5, r4, r7
            add r6, r5, r7
            add r7, r6, r7

            LoopLength7:
                // Two accumulators: q8 continues the running sum in M, q9 starts fresh.
                vld1.32 {q8}, [r2]
                vld1.32 {q12}, [r0]!
                vmla.f32 q8, q12, d0[0]
                vld1.32 {q13}, [r1]!
                vmul.f32 q9, q13, d0[1]
                vld1.32 {q12}, [r3]!
                vmla.f32 q8, q12, d1[0]
                vld1.32 {q13}, [r4]!
                vmla.f32 q9, q13, d1[1]
                vld1.32 {q12}, [r5]!
                vmla.f32 q8, q12, d2[0]
                vld1.32 {q13}, [r6]!
                vmla.f32 q9, q13, d2[1]
                vld1.32 {q12}, [r7]!
                vmla.f32 q8, q12, d3[0]

                vadd.f32 q9, q8, q9
                vst1.32 {q9}, [r2]!
                subs r11, r11, #1
                bne LoopLength7

            sub r2, r2, r8 // rewind M to the start of the block
            sub r12, r12, #7
            add r0, r7, r9 // r7 is one block past the 7th slice; + r9 = same block, next slice
            vmov.32 r1, d30[0] // restore the B pointer
            // r6 (length) and r7 (S step) were overwritten with pointers above; reload
            // them from the slots written by push {r3-r7} so another pass starts from
            // the correct values. ldr leaves the flags alone, cmp sets them afterwards.
            ldr r6, [sp, #12]
            ldr r7, [sp, #16]
            cmp r12, #7
            bge LoopK7

        pop {r3-r7}

        // ---- 4 terms of k per pass ----
        LoopKStart4:
            cmp r12, #4
            blt LoopKStart3
            // r3 and r4 are reused as S pointers; spill them to q15 lanes.
            vmov.32 d30[1], r3
            vmov.32 d31[0], r4
        LoopK4:
            vld1.32 {d0[0]}, [r1], r10
            vld1.32 {d0[1]}, [r1], r10
            vld1.32 {d1[0]}, [r1], r10
            vld1.32 {d1[1]}, [r1], r10
            mov r11, r6
            vmov.32 d30[0], r1 // park the B pointer

            add r1, r0, r7
            add r3, r1, r7
            add r4, r3, r7

            LoopLength4:
                vld1.32 {q8}, [r2]
                vld1.32 {q12}, [r0]!
                vmla.f32 q8, q12, d0[0]
                vld1.32 {q13}, [r1]!
                vmul.f32 q9, q13, d0[1]
                vld1.32 {q12}, [r3]!
                vmla.f32 q8, q12, d1[0]
                vld1.32 {q13}, [r4]!
                vmla.f32 q9, q13, d1[1]

                vadd.f32 q9, q8, q9
                vst1.32 {q9}, [r2]!
                subs r11, r11, #1
                bne LoopLength4

            sub r2, r2, r8
            sub r12, r12, #4
            add r0, r4, r9 // same block in the slice after the 4th one
            vmov.32 r1, d30[0]
            cmp r12, #4
            bge LoopK4

        vmov.32 r3, d30[1]
        vmov.32 r4, d31[0]

        // ---- 3 terms of k per pass ----
        LoopKStart3:
            cmp r12, #3
            blt LoopKStart
            vmov.32 d30[1], r3
            vmov.32 d31[0], r4
        LoopK3:
            vld1.32 {d0[0]}, [r1], r10
            vld1.32 {d0[1]}, [r1], r10
            vld1.32 {d1[0]}, [r1], r10
            mov r11, r6
            vmov.32 d30[0], r1 // park the B pointer

            add r1, r0, r7
            add r3, r1, r7

            LoopLength3:
                vld1.32 {q8}, [r2]
                vld1.32 {q12}, [r0]!
                vmla.f32 q8, q12, d0[0]
                vld1.32 {q13}, [r1]!
                vmul.f32 q9, q13, d0[1]
                vld1.32 {q12}, [r3]!
                vmla.f32 q8, q12, d1[0]

                vadd.f32 q9, q8, q9
                vst1.32 {q9}, [r2]!
                subs r11, r11, #1
                bne LoopLength3

            sub r2, r2, r8
            sub r12, r12, #3
            add r0, r3, r9 // same block in the slice after the 3rd one
            vmov.32 r1, d30[0]
            cmp r12, #3
            bge LoopK3

        vmov.32 r3, d30[1]
        vmov.32 r4, d31[0]

        // ---- remaining terms of k, one at a time ----
        LoopKStart:
        cmp r12, #0
        beq LoopKEnd

        LoopK:
            vld1.32 {d30[0]}, [r1], r10

            vdup.32 q15, d30[0] // broadcast the B coefficient to all four lanes
            mov r11, r6
            LoopLength:
                vld1.32 {q0}, [r2]
                vld1.32 {q1}, [r0]!
                vmla.f32 q0, q1, q15

                vst1.32 {q0}, [r2]!
                subs r11, r11, #1
                bne LoopLength
            subs r12, r12, #1 // sets the flags consumed by bne LoopK below

            // sub/add without the s suffix leave those flags untouched.
            sub r2, r2, r8
            add r0, r0, r9
            bne LoopK

        LoopKEnd:
            pop {r0, r1} // back to this block's S and B start pointers
            subs r3, r3, #1
            add r0, r0, r8 // next block of S within the slice
            add r2, r2, r8 // next block of M
            bne LoopW

    pop {r0, r3} // S base and w for the next h iteration
    add r1, r1, #4 //sizeof(float)
    subs r4, r4, #1
    bne LoopH

TransLeftEnd:
    pop {r4-r11, pc}

#endif
